{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 准备工作"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import time\n",
    "from random import random\n",
    "\n",
    "import pandas as pd\n",
    "import selenium\n",
    "from selenium.webdriver.common.by import By"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-2-8ae4025e7ff4>:19: DeprecationWarning: use options instead of chrome_options\n",
      "  driver = webdriver.Chrome( chrome_options = opts) #desired_capabilities=caps,\n"
     ]
    }
   ],
   "source": [
    "from selenium import webdriver\n",
    "from selenium.webdriver.common.desired_capabilities import DesiredCapabilities\n",
    "\n",
    "\n",
    "# Build the Chrome options and start the browser session that every later cell drives.\n",
    "opts = webdriver.ChromeOptions()\n",
    "opts.add_argument('--no-sandbox')  # works around the \"DevToolsActivePort file doesn't exist\" startup error\n",
    "opts.add_argument('--window-size=1920,3000')  # initial window size; Chrome expects \"width,height\", not \"1920x3000\"\n",
    "opts.add_argument('--disable-gpu')  # flag recommended by Google's docs to avoid a known bug\n",
    "opts.add_argument('--hide-scrollbars')  # hide scrollbars, for some special pages\n",
    "# Optional switches, left disabled:\n",
    "# opts.add_argument('blink-settings=imagesEnabled=false')  # skip loading images for speed\n",
    "# opts.add_argument('--headless')  # no visible window; needed on Linux hosts without a display\n",
    "# opts.binary_location = r\"<path to chrome executable>\"  # only for a non-default browser binary\n",
    "\n",
    "# `options=` replaces the deprecated `chrome_options=` keyword (same ChromeOptions object).\n",
    "driver = webdriver.Chrome(options=opts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the CNKI home page in the Selenium-controlled browser.\n",
    "driver.get(\"https://www.cnki.net/\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 检查是否登录"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the 4th block inside the page header (presumably the login entry -- confirm against the live page).\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4 removed.\n",
    "driver.find_element(By.XPATH, '//*[@id=\"headerBox\"]/div[1]/div/div/div[4]').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 校园网使用ip登录"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the element with id 'Button2' (the campus-network IP login button, per the section heading).\n",
    "# find_element(By.ID, ...) replaces find_element_by_id, which Selenium 4 removed.\n",
    "element = driver.find_element(By.ID, 'Button2')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'中山大学南方学院'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Display the inner HTML of the 'Ecp_loginShowName1' element to confirm which account is logged in.\n",
    "# find_element(By.ID, ...) replaces find_element_by_id, which Selenium 4 removed.\n",
    "element = driver.find_element(By.ID, 'Ecp_loginShowName1')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 点击高级检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the advanced-search link (id 'highSearch').\n",
    "# find_element(By.ID, ...) replaces find_element_by_id, which Selenium 4 removed.\n",
    "# The former mid-cell get_attribute('innerHTML') call was dropped: its value was never shown or used.\n",
    "element = driver.find_element(By.ID, 'highSearch')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 检查窗口位置\n",
    "* 当出现多个窗口时，一定要先检查窗口位置\n",
    "* 每一个窗口在driver中自动生成唯一的窗口id"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CDwindow-8322673B5971023DF6F39775220ACC51'"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the handle of the window the driver is currently focused on.\n",
    "driver.current_window_handle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-8322673B5971023DF6F39775220ACC51',\n",
       " 'CDwindow-F776314EA433BB279381CDFE42622F57']"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List the handles of every window/tab open in this driver session.\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 切换窗口位置"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-12-6c6d5ce6602d>:1: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# Move focus to the second window handle (presumably the tab opened by the advanced-search click).\n",
    "# `driver.switch_to.window` replaces the deprecated `driver.switch_to_window`.\n",
    "driver.switch_to.window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 点击期刊检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the journal entry (data-id 'xsqk') in the document-type menu.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4 removed.\n",
    "# The former mid-cell get_attribute('innerHTML') call was dropped: its value was never shown or used.\n",
    "element = driver.find_element(By.XPATH, '//ul[@class=\"doctype-menus keji\"]/li[@data-id=\"xsqk\"]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 勾选期刊类型"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# SCI: click the source-category checkbox with key 'SI'.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4 removed.\n",
    "element = driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"SI\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# EI: click the source-category checkbox with key 'EI'.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4 removed.\n",
    "element = driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"EI\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Peking University Core journals: click the source-category checkbox with key 'HX'.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4 removed.\n",
    "element = driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"HX\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSSCI: click the source-category checkbox with key 'CSI'.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4 removed.\n",
    "element = driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"CSI\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# CSCD: click the source-category checkbox with key 'CSD'.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4 removed.\n",
    "element = driver.find_element(By.XPATH, '//div[@class=\"extend-tit-labels\"]//input[@key=\"CSD\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 填写query\n",
    "* 可以在高级检索直接检索（只需要不精确查找）\n",
    "* 建议使用专业检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the professional-search tab (element named 'majorSearch').\n",
    "# find_element(By.NAME, ...) replaces find_element_by_name, which Selenium 4 removed.\n",
    "driver.find_element(By.NAME, 'majorSearch').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Search expression typed into the professional-search box by a later cell.\n",
    "# NOTE(review): SU / TI are presumably CNKI's subject / title field codes -- confirm against CNKI's search help.\n",
    "AI_新媒体_query = 'SU = \"新媒体\" AND  (TI =\"数据\" OR TI = \"人工智能\" OR TI = \"信息可视化\")'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Clear the first <textarea> on the page and type the search expression into it.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4 removed.\n",
    "element = driver.find_element(By.XPATH, '//textarea')\n",
    "element.clear()\n",
    "element.send_keys(AI_新媒体_query)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 点击检索"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the submit input whose value is the search label to run the query.\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4 removed.\n",
    "element = driver.find_element(By.XPATH, '//input[@value=\"检索\"]')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the control under 'id_grid_display_num' (presumably opens the results-per-page menu -- confirm on the page).\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4 removed.\n",
    "element = driver.find_element(By.XPATH, '//*[@id=\"id_grid_display_num\"]/div')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Click the third entry of the 'perPageDiv' list (presumably the largest page size -- confirm on the page).\n",
    "# find_element(By.XPATH, ...) replaces find_element_by_xpath, which Selenium 4 removed.\n",
    "element = driver.find_element(By.XPATH, '//*[@id=\"perPageDiv\"]/ul/li[3]/a')\n",
    "element.click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 抓取页面信息"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>951</td>\n",
       "      <td>广电大数据平台建设思路及架构研究</td>\n",
       "      <td>仲岩</td>\n",
       "      <td>科技创新导报</td>\n",
       "      <td>2016-12-27 18:57</td>\n",
       "      <td>1.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>952</td>\n",
       "      <td>大数据背景下恐怖主义信息的新媒体传播研究:关键问题与重要议题</td>\n",
       "      <td>石小川; 吴世文; 闫岩</td>\n",
       "      <td>湖北社会科学</td>\n",
       "      <td>2016-12-27 13:59</td>\n",
       "      <td>12.0</td>\n",
       "      <td>785.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>953</td>\n",
       "      <td>互联网大数据背景下辽宁旅游目的地的推广研究</td>\n",
       "      <td>杨晓蕾</td>\n",
       "      <td>旅游纵览(下半月)</td>\n",
       "      <td>2016-12-23</td>\n",
       "      <td>2.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>954</td>\n",
       "      <td>国土资源工作中数据成果保密工作研究</td>\n",
       "      <td>张明昭</td>\n",
       "      <td>中国管理信息化</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>955</td>\n",
       "      <td>大数据背景下的媒介技术批判研究</td>\n",
       "      <td>闵会轩</td>\n",
       "      <td>新闻前哨</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>94.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>956</td>\n",
       "      <td>媒体变革、超级链接、风云际会 媒超风:一场全渠道大数据的跨媒体运动</td>\n",
       "      <td>吴熙</td>\n",
       "      <td>安家</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>30.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>957</td>\n",
       "      <td>大数据背景下新媒体艺术产业创新模式研究</td>\n",
       "      <td>门丽; 丁宁</td>\n",
       "      <td>科技风</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>85.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>958</td>\n",
       "      <td>Lambda架构在处理海量高并发数据中的应用</td>\n",
       "      <td>韩嫕</td>\n",
       "      <td>现代电视技术</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>8.0</td>\n",
       "      <td>139.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>959</td>\n",
       "      <td>大数据和新媒体背景下高校档案工作发展的对策</td>\n",
       "      <td>安琪</td>\n",
       "      <td>黑龙江档案</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>5.0</td>\n",
       "      <td>86.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>960</td>\n",
       "      <td>新媒体大数据时代新闻编辑观的转型</td>\n",
       "      <td>缪和友</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>2.0</td>\n",
       "      <td>96.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>961</td>\n",
       "      <td>European Data Protection Regulation and Online...</td>\n",
       "      <td>David Erdos</td>\n",
       "      <td>Journal of Law and Society</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>962</td>\n",
       "      <td>基于数据深度应用的中小学课堂教学变革</td>\n",
       "      <td>翟纯</td>\n",
       "      <td>教育导刊</td>\n",
       "      <td>2016-12-10</td>\n",
       "      <td>1.0</td>\n",
       "      <td>99.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>963</td>\n",
       "      <td>新媒体大数据背景下整合营销传播的新形态</td>\n",
       "      <td>徐苏杭</td>\n",
       "      <td>今传媒</td>\n",
       "      <td>2016-12-05</td>\n",
       "      <td>10.0</td>\n",
       "      <td>619.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>964</td>\n",
       "      <td>“一体两翼”:大数据时代地方广电媒体转型之策</td>\n",
       "      <td>关琮严; 宋春光</td>\n",
       "      <td>中国记者</td>\n",
       "      <td>2016-12-01</td>\n",
       "      <td>2.0</td>\n",
       "      <td>90.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>965</td>\n",
       "      <td>广电网络大数据应用的思考</td>\n",
       "      <td>李垣</td>\n",
       "      <td>声屏世界</td>\n",
       "      <td>2016-12-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>23.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>966</td>\n",
       "      <td>数字化新媒体下数据分析技术的应用研究</td>\n",
       "      <td>段汝林</td>\n",
       "      <td>教育现代化</td>\n",
       "      <td>2016-11-28</td>\n",
       "      <td>1.0</td>\n",
       "      <td>84.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>967</td>\n",
       "      <td>城乡居民的媒体使用及其影响因素研究——基于C G SS2013数据的分析</td>\n",
       "      <td>黄俊华</td>\n",
       "      <td>新闻界</td>\n",
       "      <td>2016-11-25</td>\n",
       "      <td>7.0</td>\n",
       "      <td>351.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>968</td>\n",
       "      <td>论新媒体环境下加强大学生责任意识的教育引导——基于部分高校调查数据的实证分析</td>\n",
       "      <td>王允端</td>\n",
       "      <td>思想理论教育导刊</td>\n",
       "      <td>2016-11-20</td>\n",
       "      <td>27.0</td>\n",
       "      <td>747.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>969</td>\n",
       "      <td>大数据环境下中国网络剧商业模式新特征</td>\n",
       "      <td>邓思思</td>\n",
       "      <td>中国广播</td>\n",
       "      <td>2016-11-20</td>\n",
       "      <td>1.0</td>\n",
       "      <td>120.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>970</td>\n",
       "      <td>大数据技术在广电新媒体中的应用与发展</td>\n",
       "      <td>胡士杰</td>\n",
       "      <td>创新科技</td>\n",
       "      <td>2016-11-15</td>\n",
       "      <td>2.0</td>\n",
       "      <td>82.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>20</th>\n",
       "      <td>971</td>\n",
       "      <td>Computing; Study Data from University of Hong ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computer Weekly News</td>\n",
       "      <td>2016-11-09</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>21</th>\n",
       "      <td>972</td>\n",
       "      <td>人文社科类学术期刊微信公众平台的发展——基于533种CSSCI(2014—2015)来源期刊...</td>\n",
       "      <td>冀芳; 王召露; 张夏恒</td>\n",
       "      <td>科技与出版</td>\n",
       "      <td>2016-11-08</td>\n",
       "      <td>21.0</td>\n",
       "      <td>532.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>22</th>\n",
       "      <td>973</td>\n",
       "      <td>视觉 数据 叙事:媒介融合环境下的奥运报道创新</td>\n",
       "      <td>陈欣钢</td>\n",
       "      <td>电视研究</td>\n",
       "      <td>2016-11-05</td>\n",
       "      <td>3.0</td>\n",
       "      <td>448.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>23</th>\n",
       "      <td>974</td>\n",
       "      <td>“大数据小+书包”推进理论学习 河西党建进入“互联网+”时代</td>\n",
       "      <td>NaN</td>\n",
       "      <td>求知</td>\n",
       "      <td>2016-11-05</td>\n",
       "      <td>1.0</td>\n",
       "      <td>92.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>24</th>\n",
       "      <td>975</td>\n",
       "      <td>中国数据新闻发展的现状、困境及对策</td>\n",
       "      <td>陈积银; 杨廉</td>\n",
       "      <td>新闻记者</td>\n",
       "      <td>2016-11-05</td>\n",
       "      <td>11.0</td>\n",
       "      <td>1477.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>25</th>\n",
       "      <td>976</td>\n",
       "      <td>大数据背景下新媒体发展问题和对策</td>\n",
       "      <td>顾翠芬</td>\n",
       "      <td>中国管理信息化</td>\n",
       "      <td>2016-11-01</td>\n",
       "      <td>5.0</td>\n",
       "      <td>193.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>26</th>\n",
       "      <td>977</td>\n",
       "      <td>大数据时代下的新媒体新发展</td>\n",
       "      <td>郎清平</td>\n",
       "      <td>声屏世界·广告人</td>\n",
       "      <td>2016-11-01</td>\n",
       "      <td>2.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>27</th>\n",
       "      <td>978</td>\n",
       "      <td>Silicon; Data on Silicon Detailed by Researche...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Science Letter</td>\n",
       "      <td>2016-10-28</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>28</th>\n",
       "      <td>979</td>\n",
       "      <td>大数据时代新闻可视化传播的创新路径</td>\n",
       "      <td>于新玲</td>\n",
       "      <td>科技传播</td>\n",
       "      <td>2016-10-23</td>\n",
       "      <td>3.0</td>\n",
       "      <td>134.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>29</th>\n",
       "      <td>980</td>\n",
       "      <td>高校思想政治教育研究现状分析——基于文献大数据视角</td>\n",
       "      <td>杨辉; 白青利; 张勇</td>\n",
       "      <td>石油教育</td>\n",
       "      <td>2016-10-20</td>\n",
       "      <td>2.0</td>\n",
       "      <td>248.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>30</th>\n",
       "      <td>981</td>\n",
       "      <td>拥抱大数据</td>\n",
       "      <td>NaN</td>\n",
       "      <td>东南传播</td>\n",
       "      <td>2016-10-20</td>\n",
       "      <td>NaN</td>\n",
       "      <td>13.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>31</th>\n",
       "      <td>982</td>\n",
       "      <td>大数据时代下传统媒体与新媒体的发展困境与趋势分析</td>\n",
       "      <td>汪跃文</td>\n",
       "      <td>传播与版权</td>\n",
       "      <td>2016-10-15</td>\n",
       "      <td>12.0</td>\n",
       "      <td>300.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>32</th>\n",
       "      <td>983</td>\n",
       "      <td>运用大数据促进信息公开环境共治</td>\n",
       "      <td>NaN</td>\n",
       "      <td>环境保护与循环经济</td>\n",
       "      <td>2016-10-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>62.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>33</th>\n",
       "      <td>984</td>\n",
       "      <td>大数据背景下大学生职业信息库建设路径探析</td>\n",
       "      <td>刘紫婷</td>\n",
       "      <td>黄河水利职业技术学院学报</td>\n",
       "      <td>2016-10-15</td>\n",
       "      <td>5.0</td>\n",
       "      <td>102.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34</th>\n",
       "      <td>985</td>\n",
       "      <td>大数据时代的精准广告及其传播策略——基于场域理论视角的分析</td>\n",
       "      <td>张勤</td>\n",
       "      <td>视听</td>\n",
       "      <td>2016-10-15</td>\n",
       "      <td>7.0</td>\n",
       "      <td>414.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>35</th>\n",
       "      <td>986</td>\n",
       "      <td>人工智能与新媒体的进化路径</td>\n",
       "      <td>刘芬</td>\n",
       "      <td>中国传媒科技</td>\n",
       "      <td>2016-10-15</td>\n",
       "      <td>13.0</td>\n",
       "      <td>1127.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>36</th>\n",
       "      <td>987</td>\n",
       "      <td>大数据时代的媒体融合之路</td>\n",
       "      <td>初小燕</td>\n",
       "      <td>新媒体研究</td>\n",
       "      <td>2016-10-12 14:12</td>\n",
       "      <td>2.0</td>\n",
       "      <td>116.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>37</th>\n",
       "      <td>988</td>\n",
       "      <td>探讨大数据时代广播新闻的转型策略</td>\n",
       "      <td>刘江帆</td>\n",
       "      <td>新闻研究导刊</td>\n",
       "      <td>2016-10-10</td>\n",
       "      <td>1.0</td>\n",
       "      <td>27.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>38</th>\n",
       "      <td>989</td>\n",
       "      <td>大数据环境下基于移动客户端的传统媒体转型思路</td>\n",
       "      <td>徐皞亮</td>\n",
       "      <td>新闻世界</td>\n",
       "      <td>2016-10-10</td>\n",
       "      <td>1.0</td>\n",
       "      <td>149.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>39</th>\n",
       "      <td>990</td>\n",
       "      <td>大数据、新媒体技术在广电中的应用和前景简析</td>\n",
       "      <td>吴量</td>\n",
       "      <td>科技传播</td>\n",
       "      <td>2016-10-08</td>\n",
       "      <td>6.0</td>\n",
       "      <td>121.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>40</th>\n",
       "      <td>991</td>\n",
       "      <td>试析数据新闻报道的形象化</td>\n",
       "      <td>王振宇</td>\n",
       "      <td>编辑之友</td>\n",
       "      <td>2016-10-05</td>\n",
       "      <td>1.0</td>\n",
       "      <td>215.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>41</th>\n",
       "      <td>992</td>\n",
       "      <td>中国邮政与腾讯达成战略合作,布局新媒体与大数据</td>\n",
       "      <td>NaN</td>\n",
       "      <td>中国储运</td>\n",
       "      <td>2016-10-01</td>\n",
       "      <td>NaN</td>\n",
       "      <td>61.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>42</th>\n",
       "      <td>993</td>\n",
       "      <td>大数据与高校学生工作的实践创新</td>\n",
       "      <td>于乐; 张凤寒; 李玉纯; 杨直凡</td>\n",
       "      <td>中华文化论坛</td>\n",
       "      <td>2016-09-30</td>\n",
       "      <td>3.0</td>\n",
       "      <td>445.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>43</th>\n",
       "      <td>994</td>\n",
       "      <td>Computer Processing and Data Preparation; New ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computers, Networks &amp; Communications</td>\n",
       "      <td>2016-09-29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>44</th>\n",
       "      <td>995</td>\n",
       "      <td>Computer Processing and Data Preparation; New ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computers Networks &amp; Communications</td>\n",
       "      <td>2016-09-29</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>45</th>\n",
       "      <td>996</td>\n",
       "      <td>城市电视台在新媒体环境下发展数据新闻的思考</td>\n",
       "      <td>赵慧; 蔡凌楚</td>\n",
       "      <td>科技传播</td>\n",
       "      <td>2016-09-23</td>\n",
       "      <td>1.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>46</th>\n",
       "      <td>997</td>\n",
       "      <td>大数据时代下新媒体的传播优势解析</td>\n",
       "      <td>刘力豪</td>\n",
       "      <td>科技传播</td>\n",
       "      <td>2016-09-23</td>\n",
       "      <td>1.0</td>\n",
       "      <td>159.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>47</th>\n",
       "      <td>998</td>\n",
       "      <td>大数据背景下传媒对我国公共体育的影响</td>\n",
       "      <td>吴立新</td>\n",
       "      <td>电子测试</td>\n",
       "      <td>2016-09-22 17:04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>69.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>48</th>\n",
       "      <td>999</td>\n",
       "      <td>Computer Processing and Data Preparation; New ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computers, Networks &amp; Communications</td>\n",
       "      <td>2016-09-22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>49</th>\n",
       "      <td>1000</td>\n",
       "      <td>Computer Processing and Data Preparation; New ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computers Networks &amp; Communications</td>\n",
       "      <td>2016-09-22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "    Unnamed: 0                                                 篇名  \\\n",
       "0          951                                   广电大数据平台建设思路及架构研究   \n",
       "1          952                     大数据背景下恐怖主义信息的新媒体传播研究:关键问题与重要议题   \n",
       "2          953                              互联网大数据背景下辽宁旅游目的地的推广研究   \n",
       "3          954                                  国土资源工作中数据成果保密工作研究   \n",
       "4          955                                    大数据背景下的媒介技术批判研究   \n",
       "5          956                  媒体变革、超级链接、风云际会 媒超风:一场全渠道大数据的跨媒体运动   \n",
       "6          957                                大数据背景下新媒体艺术产业创新模式研究   \n",
       "7          958                             Lambda架构在处理海量高并发数据中的应用   \n",
       "8          959                              大数据和新媒体背景下高校档案工作发展的对策   \n",
       "9          960                                   新媒体大数据时代新闻编辑观的转型   \n",
       "10         961  European Data Protection Regulation and Online...   \n",
       "11         962                                 基于数据深度应用的中小学课堂教学变革   \n",
       "12         963                                新媒体大数据背景下整合营销传播的新形态   \n",
       "13         964                             “一体两翼”:大数据时代地方广电媒体转型之策   \n",
       "14         965                                       广电网络大数据应用的思考   \n",
       "15         966                                 数字化新媒体下数据分析技术的应用研究   \n",
       "16         967               城乡居民的媒体使用及其影响因素研究——基于C G SS2013数据的分析   \n",
       "17         968             论新媒体环境下加强大学生责任意识的教育引导——基于部分高校调查数据的实证分析   \n",
       "18         969                                 大数据环境下中国网络剧商业模式新特征   \n",
       "19         970                                 大数据技术在广电新媒体中的应用与发展   \n",
       "20         971  Computing; Study Data from University of Hong ...   \n",
       "21         972  人文社科类学术期刊微信公众平台的发展——基于533种CSSCI(2014—2015)来源期刊...   \n",
       "22         973                            视觉 数据 叙事:媒介融合环境下的奥运报道创新   \n",
       "23         974                     “大数据小+书包”推进理论学习 河西党建进入“互联网+”时代   \n",
       "24         975                                  中国数据新闻发展的现状、困境及对策   \n",
       "25         976                                   大数据背景下新媒体发展问题和对策   \n",
       "26         977                                      大数据时代下的新媒体新发展   \n",
       "27         978  Silicon; Data on Silicon Detailed by Researche...   \n",
       "28         979                                  大数据时代新闻可视化传播的创新路径   \n",
       "29         980                          高校思想政治教育研究现状分析——基于文献大数据视角   \n",
       "30         981                                              拥抱大数据   \n",
       "31         982                           大数据时代下传统媒体与新媒体的发展困境与趋势分析   \n",
       "32         983                                    运用大数据促进信息公开环境共治   \n",
       "33         984                               大数据背景下大学生职业信息库建设路径探析   \n",
       "34         985                      大数据时代的精准广告及其传播策略——基于场域理论视角的分析   \n",
       "35         986                                      人工智能与新媒体的进化路径   \n",
       "36         987                                       大数据时代的媒体融合之路   \n",
       "37         988                                   探讨大数据时代广播新闻的转型策略   \n",
       "38         989                             大数据环境下基于移动客户端的传统媒体转型思路   \n",
       "39         990                              大数据、新媒体技术在广电中的应用和前景简析   \n",
       "40         991                                       试析数据新闻报道的形象化   \n",
       "41         992                            中国邮政与腾讯达成战略合作,布局新媒体与大数据   \n",
       "42         993                                    大数据与高校学生工作的实践创新   \n",
       "43         994  Computer Processing and Data Preparation; New ...   \n",
       "44         995  Computer Processing and Data Preparation; New ...   \n",
       "45         996                              城市电视台在新媒体环境下发展数据新闻的思考   \n",
       "46         997                                   大数据时代下新媒体的传播优势解析   \n",
       "47         998                                 大数据背景下传媒对我国公共体育的影响   \n",
       "48         999  Computer Processing and Data Preparation; New ...   \n",
       "49        1000  Computer Processing and Data Preparation; New ...   \n",
       "\n",
       "                   作者                                    刊名              发表时间  \\\n",
       "0                  仲岩                                科技创新导报  2016-12-27 18:57   \n",
       "1        石小川; 吴世文; 闫岩                                湖北社会科学  2016-12-27 13:59   \n",
       "2                 杨晓蕾                             旅游纵览(下半月)        2016-12-23   \n",
       "3                 张明昭                               中国管理信息化        2016-12-15   \n",
       "4                 闵会轩                                  新闻前哨        2016-12-15   \n",
       "5                  吴熙                                    安家        2016-12-15   \n",
       "6              门丽; 丁宁                                   科技风        2016-12-15   \n",
       "7                  韩嫕                                现代电视技术        2016-12-15   \n",
       "8                  安琪                                 黑龙江档案        2016-12-15   \n",
       "9                 缪和友                                中国传媒科技        2016-12-15   \n",
       "10        David Erdos            Journal of Law and Society        2016-12-15   \n",
       "11                 翟纯                                  教育导刊        2016-12-10   \n",
       "12                徐苏杭                                   今传媒        2016-12-05   \n",
       "13           关琮严; 宋春光                                  中国记者        2016-12-01   \n",
       "14                 李垣                                  声屏世界        2016-12-01   \n",
       "15                段汝林                                 教育现代化        2016-11-28   \n",
       "16                黄俊华                                   新闻界        2016-11-25   \n",
       "17                王允端                              思想理论教育导刊        2016-11-20   \n",
       "18                邓思思                                  中国广播        2016-11-20   \n",
       "19                胡士杰                                  创新科技        2016-11-15   \n",
       "20                NaN                  Computer Weekly News        2016-11-09   \n",
       "21       冀芳; 王召露; 张夏恒                                 科技与出版        2016-11-08   \n",
       "22                陈欣钢                                  电视研究        2016-11-05   \n",
       "23                NaN                                    求知        2016-11-05   \n",
       "24            陈积银; 杨廉                                  新闻记者        2016-11-05   \n",
       "25                顾翠芬                               中国管理信息化        2016-11-01   \n",
       "26                郎清平                              声屏世界·广告人        2016-11-01   \n",
       "27                NaN                        Science Letter        2016-10-28   \n",
       "28                于新玲                                  科技传播        2016-10-23   \n",
       "29        杨辉; 白青利; 张勇                                  石油教育        2016-10-20   \n",
       "30                NaN                                  东南传播        2016-10-20   \n",
       "31                汪跃文                                 传播与版权        2016-10-15   \n",
       "32                NaN                             环境保护与循环经济        2016-10-15   \n",
       "33                刘紫婷                          黄河水利职业技术学院学报        2016-10-15   \n",
       "34                 张勤                                    视听        2016-10-15   \n",
       "35                 刘芬                                中国传媒科技        2016-10-15   \n",
       "36                初小燕                                 新媒体研究  2016-10-12 14:12   \n",
       "37                刘江帆                                新闻研究导刊        2016-10-10   \n",
       "38                徐皞亮                                  新闻世界        2016-10-10   \n",
       "39                 吴量                                  科技传播        2016-10-08   \n",
       "40                王振宇                                  编辑之友        2016-10-05   \n",
       "41                NaN                                  中国储运        2016-10-01   \n",
       "42  于乐; 张凤寒; 李玉纯; 杨直凡                                中华文化论坛        2016-09-30   \n",
       "43                NaN  Computers, Networks & Communications        2016-09-29   \n",
       "44                NaN   Computers Networks & Communications        2016-09-29   \n",
       "45            赵慧; 蔡凌楚                                  科技传播        2016-09-23   \n",
       "46                刘力豪                                  科技传播        2016-09-23   \n",
       "47                吴立新                                  电子测试  2016-09-22 17:04   \n",
       "48                NaN  Computers, Networks & Communications        2016-09-22   \n",
       "49                NaN   Computers Networks & Communications        2016-09-22   \n",
       "\n",
       "      被引      下载   操作  \n",
       "0    1.0   103.0   下载  \n",
       "1   12.0   785.0   下载  \n",
       "2    2.0     5.0   下载  \n",
       "3    NaN    34.0   下载  \n",
       "4    NaN    94.0   下载  \n",
       "5    NaN    30.0   下载  \n",
       "6    NaN    85.0   下载  \n",
       "7    8.0   139.0   下载  \n",
       "8    5.0    86.0   下载  \n",
       "9    2.0    96.0   下载  \n",
       "10   NaN     NaN  NaN  \n",
       "11   1.0    99.0   下载  \n",
       "12  10.0   619.0   下载  \n",
       "13   2.0    90.0   下载  \n",
       "14   NaN    23.0   下载  \n",
       "15   1.0    84.0   下载  \n",
       "16   7.0   351.0   下载  \n",
       "17  27.0   747.0   下载  \n",
       "18   1.0   120.0   下载  \n",
       "19   2.0    82.0   下载  \n",
       "20   NaN     NaN  NaN  \n",
       "21  21.0   532.0   下载  \n",
       "22   3.0   448.0   下载  \n",
       "23   1.0    92.0   下载  \n",
       "24  11.0  1477.0   下载  \n",
       "25   5.0   193.0   下载  \n",
       "26   2.0   102.0   下载  \n",
       "27   NaN     NaN  NaN  \n",
       "28   3.0   134.0   下载  \n",
       "29   2.0   248.0   下载  \n",
       "30   NaN    13.0   下载  \n",
       "31  12.0   300.0   下载  \n",
       "32   NaN    62.0   下载  \n",
       "33   5.0   102.0   下载  \n",
       "34   7.0   414.0   下载  \n",
       "35  13.0  1127.0   下载  \n",
       "36   2.0   116.0   下载  \n",
       "37   1.0    27.0   下载  \n",
       "38   1.0   149.0   下载  \n",
       "39   6.0   121.0   下载  \n",
       "40   1.0   215.0   下载  \n",
       "41   NaN    61.0   下载  \n",
       "42   3.0   445.0   下载  \n",
       "43   NaN     NaN  NaN  \n",
       "44   NaN     NaN  NaN  \n",
       "45   1.0    25.0   下载  \n",
       "46   1.0   159.0   下载  \n",
       "47   NaN    69.0   下载  \n",
       "48   NaN     NaN  NaN  \n",
       "49   NaN     NaN  NaN  "
      ]
     },
     "execution_count": 50,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 抓取首页（第一页）页面信息\n",
    "element = driver.find_element_by_id('gridTable')\n",
    "含有页面主要数据的表格html_ = element.get_attribute('innerHTML')\n",
    "首页主要数据 = pd.read_html(含有页面主要数据的表格html_)[0]\n",
    "首页主要数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 翻页"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'下一页'"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "element = driver.find_element_by_id('PageNext')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Module-level holders for the paging crawl\n",
    "表格_html = {}  # filled by process_pages: loop key -> grid innerHTML\n",
    "main_content = \"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19]\n"
     ]
    }
   ],
   "source": [
    "# 所有页数\n",
    "pages = list(range(1,20))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_pages(pages, min_wait=30, jitter=20):\n",
    "    \"\"\"Click 'next page' once per entry of ``pages`` and store the grid shown afterwards.\n",
    "\n",
    "    For each key ``p`` the element with id ``PageNext`` is clicked, the crawl pauses,\n",
    "    and the innerHTML of the element with id ``gridTable`` is written to the\n",
    "    module-level dict ``表格_html`` under ``p``. The click comes first, so the page\n",
    "    that is on screen when the function is called is not captured here.\n",
    "\n",
    "    Args:\n",
    "        pages: iterable of keys, one per click.\n",
    "        min_wait: minimum pause in seconds after each click (default 30).\n",
    "        jitter: upper bound in seconds of the random extra pause (default 20).\n",
    "    \"\"\"\n",
    "    for p in pages:\n",
    "        print(p, end='\\t')\n",
    "        # find_element_by_id was removed in Selenium 4.3; 'id' is the By.ID strategy string\n",
    "        driver.find_element('id', 'PageNext').click()\n",
    "        # Randomised pause so the crawler is not blocked and no captcha is triggered\n",
    "        time.sleep(min_wait + jitter * random())\n",
    "        表格_html[p] = driver.find_element('id', 'gridTable').get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t11\t12\t13\t14\t15\t16\t17\t18\t19\t"
     ]
    }
   ],
   "source": [
    "# 翻页操作\n",
    "process_pages(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>html_snippets</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>5</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>6</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>7</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>8</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>9</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>11</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>12</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>13</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>14</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>15</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>16</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>17</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>18</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>19</th>\n",
       "      <td>\\n&lt;div class=\"toolbar\"&gt;&lt;div id=\"countPageDiv\" ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                        html_snippets\n",
       "1   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "2   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "3   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "4   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "5   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "6   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "7   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "8   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "9   \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "10  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "11  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "12  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "13  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "14  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "15  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "16  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "17  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "18  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ...\n",
       "19  \\n<div class=\"toolbar\"><div id=\"countPageDiv\" ..."
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "df = pd.DataFrame([表格_html]).T\n",
    "df.columns = [\"html_snippets\"]\n",
    "display(df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 存储文件"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "网站 = \"中国知网\"\n",
    "# Output locations; the {网站} placeholder is filled in with str.format when saving\n",
    "fn = {\n",
    "    \"output\": {\n",
    "        \"htm_snippets\": \"CNKI homework/知网_htm_snippets_{网站}.tsv\",\n",
    "    },\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the raw HTML snippets as a tab-separated file (the path template ends in .tsv)\n",
    "filename = fn[\"output\"][\"htm_snippets\"]\n",
    "df.to_csv(filename.format(网站=网站), sep=\"\\t\", encoding=\"utf8\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Parse each stored grid snippet; read_html yields one frame per <table>, keep the first\n",
    "l_df = [pd.read_html(表格_html[p])[0] for p in pages]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 51,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>951</td>\n",
       "      <td>广电大数据平台建设思路及架构研究</td>\n",
       "      <td>仲岩</td>\n",
       "      <td>科技创新导报</td>\n",
       "      <td>2016-12-27 18:57</td>\n",
       "      <td>1.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>952</td>\n",
       "      <td>大数据背景下恐怖主义信息的新媒体传播研究:关键问题与重要议题</td>\n",
       "      <td>石小川; 吴世文; 闫岩</td>\n",
       "      <td>湖北社会科学</td>\n",
       "      <td>2016-12-27 13:59</td>\n",
       "      <td>12.0</td>\n",
       "      <td>785.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>953</td>\n",
       "      <td>互联网大数据背景下辽宁旅游目的地的推广研究</td>\n",
       "      <td>杨晓蕾</td>\n",
       "      <td>旅游纵览(下半月)</td>\n",
       "      <td>2016-12-23</td>\n",
       "      <td>2.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>954</td>\n",
       "      <td>国土资源工作中数据成果保密工作研究</td>\n",
       "      <td>张明昭</td>\n",
       "      <td>中国管理信息化</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>955</td>\n",
       "      <td>大数据背景下的媒介技术批判研究</td>\n",
       "      <td>闵会轩</td>\n",
       "      <td>新闻前哨</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>94.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>945</th>\n",
       "      <td>996</td>\n",
       "      <td>城市电视台在新媒体环境下发展数据新闻的思考</td>\n",
       "      <td>赵慧; 蔡凌楚</td>\n",
       "      <td>科技传播</td>\n",
       "      <td>2016-09-23</td>\n",
       "      <td>1.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>946</th>\n",
       "      <td>997</td>\n",
       "      <td>大数据时代下新媒体的传播优势解析</td>\n",
       "      <td>刘力豪</td>\n",
       "      <td>科技传播</td>\n",
       "      <td>2016-09-23</td>\n",
       "      <td>1.0</td>\n",
       "      <td>159.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>947</th>\n",
       "      <td>998</td>\n",
       "      <td>大数据背景下传媒对我国公共体育的影响</td>\n",
       "      <td>吴立新</td>\n",
       "      <td>电子测试</td>\n",
       "      <td>2016-09-22 17:04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>69.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>948</th>\n",
       "      <td>999</td>\n",
       "      <td>Computer Processing and Data Preparation; New ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computers, Networks &amp; Communications</td>\n",
       "      <td>2016-09-22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>949</th>\n",
       "      <td>1000</td>\n",
       "      <td>Computer Processing and Data Preparation; New ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computers Networks &amp; Communications</td>\n",
       "      <td>2016-09-22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                                 篇名  \\\n",
       "0           951                                   广电大数据平台建设思路及架构研究   \n",
       "1           952                     大数据背景下恐怖主义信息的新媒体传播研究:关键问题与重要议题   \n",
       "2           953                              互联网大数据背景下辽宁旅游目的地的推广研究   \n",
       "3           954                                  国土资源工作中数据成果保密工作研究   \n",
       "4           955                                    大数据背景下的媒介技术批判研究   \n",
       "..          ...                                                ...   \n",
       "945         996                              城市电视台在新媒体环境下发展数据新闻的思考   \n",
       "946         997                                   大数据时代下新媒体的传播优势解析   \n",
       "947         998                                 大数据背景下传媒对我国公共体育的影响   \n",
       "948         999  Computer Processing and Data Preparation; New ...   \n",
       "949        1000  Computer Processing and Data Preparation; New ...   \n",
       "\n",
       "               作者                                    刊名              发表时间  \\\n",
       "0              仲岩                                科技创新导报  2016-12-27 18:57   \n",
       "1    石小川; 吴世文; 闫岩                                湖北社会科学  2016-12-27 13:59   \n",
       "2             杨晓蕾                             旅游纵览(下半月)        2016-12-23   \n",
       "3             张明昭                               中国管理信息化        2016-12-15   \n",
       "4             闵会轩                                  新闻前哨        2016-12-15   \n",
       "..            ...                                   ...               ...   \n",
       "945       赵慧; 蔡凌楚                                  科技传播        2016-09-23   \n",
       "946           刘力豪                                  科技传播        2016-09-23   \n",
       "947           吴立新                                  电子测试  2016-09-22 17:04   \n",
       "948           NaN  Computers, Networks & Communications        2016-09-22   \n",
       "949           NaN   Computers Networks & Communications        2016-09-22   \n",
       "\n",
       "       被引     下载   操作  \n",
       "0     1.0  103.0   下载  \n",
       "1    12.0  785.0   下载  \n",
       "2     2.0    5.0   下载  \n",
       "3     NaN   34.0   下载  \n",
       "4     NaN   94.0   下载  \n",
       "..    ...    ...  ...  \n",
       "945   1.0   25.0   下载  \n",
       "946   1.0  159.0   下载  \n",
       "947   NaN   69.0   下载  \n",
       "948   NaN    NaN  NaN  \n",
       "949   NaN    NaN  NaN  \n",
       "\n",
       "[1000 rows x 8 columns]"
      ]
     },
     "execution_count": 51,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_url_out = pd.concat(l_df).reset_index(drop=True)\n",
    "df_总表 = 首页主要数据.append(df_url_out)\n",
    "df_总表"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 52,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Unnamed: 0</th>\n",
       "      <th>篇名</th>\n",
       "      <th>作者</th>\n",
       "      <th>刊名</th>\n",
       "      <th>发表时间</th>\n",
       "      <th>被引</th>\n",
       "      <th>下载</th>\n",
       "      <th>操作</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>951</td>\n",
       "      <td>广电大数据平台建设思路及架构研究</td>\n",
       "      <td>仲岩</td>\n",
       "      <td>科技创新导报</td>\n",
       "      <td>2016-12-27 18:57</td>\n",
       "      <td>1.0</td>\n",
       "      <td>103.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>952</td>\n",
       "      <td>大数据背景下恐怖主义信息的新媒体传播研究:关键问题与重要议题</td>\n",
       "      <td>石小川; 吴世文; 闫岩</td>\n",
       "      <td>湖北社会科学</td>\n",
       "      <td>2016-12-27 13:59</td>\n",
       "      <td>12.0</td>\n",
       "      <td>785.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>953</td>\n",
       "      <td>互联网大数据背景下辽宁旅游目的地的推广研究</td>\n",
       "      <td>杨晓蕾</td>\n",
       "      <td>旅游纵览(下半月)</td>\n",
       "      <td>2016-12-23</td>\n",
       "      <td>2.0</td>\n",
       "      <td>5.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>954</td>\n",
       "      <td>国土资源工作中数据成果保密工作研究</td>\n",
       "      <td>张明昭</td>\n",
       "      <td>中国管理信息化</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>34.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>955</td>\n",
       "      <td>大数据背景下的媒介技术批判研究</td>\n",
       "      <td>闵会轩</td>\n",
       "      <td>新闻前哨</td>\n",
       "      <td>2016-12-15</td>\n",
       "      <td>NaN</td>\n",
       "      <td>94.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>945</th>\n",
       "      <td>996</td>\n",
       "      <td>城市电视台在新媒体环境下发展数据新闻的思考</td>\n",
       "      <td>赵慧; 蔡凌楚</td>\n",
       "      <td>科技传播</td>\n",
       "      <td>2016-09-23</td>\n",
       "      <td>1.0</td>\n",
       "      <td>25.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>946</th>\n",
       "      <td>997</td>\n",
       "      <td>大数据时代下新媒体的传播优势解析</td>\n",
       "      <td>刘力豪</td>\n",
       "      <td>科技传播</td>\n",
       "      <td>2016-09-23</td>\n",
       "      <td>1.0</td>\n",
       "      <td>159.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>947</th>\n",
       "      <td>998</td>\n",
       "      <td>大数据背景下传媒对我国公共体育的影响</td>\n",
       "      <td>吴立新</td>\n",
       "      <td>电子测试</td>\n",
       "      <td>2016-09-22 17:04</td>\n",
       "      <td>NaN</td>\n",
       "      <td>69.0</td>\n",
       "      <td>下载</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>948</th>\n",
       "      <td>999</td>\n",
       "      <td>Computer Processing and Data Preparation; New ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computers, Networks &amp; Communications</td>\n",
       "      <td>2016-09-22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>949</th>\n",
       "      <td>1000</td>\n",
       "      <td>Computer Processing and Data Preparation; New ...</td>\n",
       "      <td>NaN</td>\n",
       "      <td>Computers Networks &amp; Communications</td>\n",
       "      <td>2016-09-22</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "      <td>NaN</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>1000 rows × 8 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "     Unnamed: 0                                                 篇名  \\\n",
       "0           951                                   广电大数据平台建设思路及架构研究   \n",
       "1           952                     大数据背景下恐怖主义信息的新媒体传播研究:关键问题与重要议题   \n",
       "2           953                              互联网大数据背景下辽宁旅游目的地的推广研究   \n",
       "3           954                                  国土资源工作中数据成果保密工作研究   \n",
       "4           955                                    大数据背景下的媒介技术批判研究   \n",
       "..          ...                                                ...   \n",
       "945         996                              城市电视台在新媒体环境下发展数据新闻的思考   \n",
       "946         997                                   大数据时代下新媒体的传播优势解析   \n",
       "947         998                                 大数据背景下传媒对我国公共体育的影响   \n",
       "948         999  Computer Processing and Data Preparation; New ...   \n",
       "949        1000  Computer Processing and Data Preparation; New ...   \n",
       "\n",
       "               作者                                    刊名              发表时间  \\\n",
       "0              仲岩                                科技创新导报  2016-12-27 18:57   \n",
       "1    石小川; 吴世文; 闫岩                                湖北社会科学  2016-12-27 13:59   \n",
       "2             杨晓蕾                             旅游纵览(下半月)        2016-12-23   \n",
       "3             张明昭                               中国管理信息化        2016-12-15   \n",
       "4             闵会轩                                  新闻前哨        2016-12-15   \n",
       "..            ...                                   ...               ...   \n",
       "945       赵慧; 蔡凌楚                                  科技传播        2016-09-23   \n",
       "946           刘力豪                                  科技传播        2016-09-23   \n",
       "947           吴立新                                  电子测试  2016-09-22 17:04   \n",
       "948           NaN  Computers, Networks & Communications        2016-09-22   \n",
       "949           NaN   Computers Networks & Communications        2016-09-22   \n",
       "\n",
       "       被引     下载   操作  \n",
       "0     1.0  103.0   下载  \n",
       "1    12.0  785.0   下载  \n",
       "2     2.0    5.0   下载  \n",
       "3     NaN   34.0   下载  \n",
       "4     NaN   94.0   下载  \n",
       "..    ...    ...  ...  \n",
       "945   1.0   25.0   下载  \n",
       "946   1.0  159.0   下载  \n",
       "947   NaN   69.0   下载  \n",
       "948   NaN    NaN  NaN  \n",
       "949   NaN    NaN  NaN  \n",
       "\n",
       "[1000 rows x 8 columns]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 将内容表格存在本地\n",
    "with pd.ExcelWriter('Selenium知网数据.xlsx',mode='w',engine=\"openpyxl\") as writer:  \n",
    "            df_总表.to_excel(writer,sheet_name=\"知网数据\")\n",
    "display(df_总表)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Next step: try to export a RefWorks (.txt) file and download the full texts.\n",
    "# After the paging above, click the element with id 'total' (the original note says\n",
    "# this brings the result list back to its first page).\n",
    "# find_element_by_id was removed in Selenium 4.3; 'id' is the By.ID strategy string.\n",
    "driver.find_element('id', 'total').click()\n",
    "element = None  # click() returns None, which is what the old one-liner stored here"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'下一页'"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 翻页\n",
    "element = driver.find_element_by_id('PageNext')\n",
    "element.get_attribute('innerHTML')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Fresh module-level holders for the export step\n",
    "导出_html = {}\n",
    "main_content_ = \"\"\n",
    "element = None"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 分两次爬取信息，全选超过500篇下载不成功"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n"
     ]
    }
   ],
   "source": [
    "# 导出refworks文件（.txt）和下载文章 —— 页面选中（50）篇 —— 翻页在选中 （此操作一直循环） ！！注意：每次全选不能超过500篇，故此次爬取905篇文章，分2次进行\n",
    "pages = list(range(1,11))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "def process_xuanzhong(pages, min_wait=30, jitter=20):\n",
    "    \"\"\"Tick select-all on the current page, then go to the next page, once per entry of ``pages``.\n",
    "\n",
    "    For each key the element with id ``selectCheckAll1`` is clicked, then the\n",
    "    element with id ``PageNext``, followed by a randomised pause.\n",
    "\n",
    "    Args:\n",
    "        pages: iterable of keys, one per select-and-advance step.\n",
    "        min_wait: minimum pause in seconds after each step (default 30).\n",
    "        jitter: upper bound in seconds of the random extra pause (default 20).\n",
    "    \"\"\"\n",
    "    for p in pages:\n",
    "        print(p, end='\\t')\n",
    "        # find_element_by_id was removed in Selenium 4.3; 'id' is the By.ID strategy string\n",
    "        driver.find_element('id', 'selectCheckAll1').click()\n",
    "        driver.find_element('id', 'PageNext').click()\n",
    "        time.sleep(min_wait + jitter * random())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1\t2\t3\t4\t5\t6\t7\t8\t9\t10\t"
     ]
    }
   ],
   "source": [
    "process_xuanzhong (pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出与分析 \n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-d\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出文献\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-r\"]').click()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 返回页面再次选取文章"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 点击Refworks\n",
    "element = driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['CDwindow-8322673B5971023DF6F39775220ACC51',\n",
       " 'CDwindow-F776314EA433BB279381CDFE42622F57',\n",
       " 'CDwindow-51C8032563DFC15915AC9378226E9A23']"
      ]
     },
     "execution_count": 62,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 所有窗口ID\n",
    "driver.window_handles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-63-520070efe65b>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[2])\n"
     ]
    }
   ],
   "source": [
    "# 窗口切换\n",
    "driver.switch_to_window(driver.window_handles[2])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出 .txt文件\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-65-0188c2a7ff70>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 窗口切换\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 批量下载\n",
    "element = driver.find_element_by_xpath('//li[@class=\"bulkdownload export\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-68-1f3bb34cc9cb>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[3])\n"
     ]
    }
   ],
   "source": [
    "# 窗口切换\n",
    "driver.switch_to_window(driver.window_handles[3])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 下载所选文献（500篇）\n",
    "element = driver.find_element_by_id('btn-download-all').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-70-8b82f1e63dce>:2: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 窗口切换 ——> 返回论文列表页\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 清除所选的500篇文献\n",
    "element = driver.find_element_by_xpath('//*[@id=\"gridTable\"]/div[1]/div[2]/div[1]/a').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[10, 11, 12, 13, 14, 15, 16, 17, 18]\n"
     ]
    }
   ],
   "source": [
    "pages = list(range(10,19))\n",
    "print(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 选中页面50篇 —> 翻页\n",
    "def process_xuanzhong (pages):\n",
    "    for p in pages:\n",
    "        print (p,end='\\t')\n",
    "        全选 = driver.find_element_by_id('selectCheckAll1')\n",
    "        全选.click()\n",
    "        跳转 = driver.find_element_by_id('PageNext')\n",
    "        跳转.click()\n",
    "        time.sleep(30+20*random())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10\t11\t12\t13\t14\t15\t16\t17\t18\t"
     ]
    }
   ],
   "source": [
    "process_xuanzhong (pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 75,
   "metadata": {},
   "outputs": [],
   "source": [
    "# 导出与分析 \n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-d\"]').click()\n",
    "# 导出文献\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon-r\"]').click()\n",
    "# 点击Refworks\n",
    "element = driver.find_element_by_xpath('//a[@exporttype=\"Refworks\"]').click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-76-4731a2a1674f>:4: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[4])\n"
     ]
    }
   ],
   "source": [
    "# 所有窗口ID\n",
    "driver.window_handles\n",
    "# 窗口切换\n",
    "driver.switch_to_window(driver.window_handles[4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-77-8deab74ae236>:4: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 导出 .txt文件\n",
    "element = driver.find_element_by_xpath('//i[@class=\"icon icon-export\"]').click()\n",
    "# 窗口切换\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 80,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-80-343c4ecf3e24>:1: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n",
      "<ipython-input-80-343c4ecf3e24>:5: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[5])\n"
     ]
    }
   ],
   "source": [
    " driver.switch_to_window(driver.window_handles[1])\n",
    "# 批量下载\n",
    "element = driver.find_element_by_xpath('//li[@class=\"bulkdownload export\"]').click()\n",
    "# 窗口切换\n",
    "driver.switch_to_window(driver.window_handles[5])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 81,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "<ipython-input-81-ab38f8ba609c>:4: DeprecationWarning: use driver.switch_to.window instead\n",
      "  driver.switch_to_window(driver.window_handles[1])\n"
     ]
    }
   ],
   "source": [
    "# 下载所选文献（450篇）\n",
    "element = driver.find_element_by_id('btn-download-all').click()\n",
    "# 窗口切换\n",
    "driver.switch_to_window(driver.window_handles[1])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2.2 处理验证码(若没弹出省略此步骤)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 82,
   "metadata": {},
   "outputs": [],
   "source": [
    "def baidu_API_OCR(image_url):\n",
    "    #1.获取百度token\n",
    "    #clinet_id 为官网获取的AK,client_secret为官网获取的SK\n",
    "    host ='https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=KhGnaGqXiUqmc17r5QxjZycN&client_secret=BkyIw69G4NwOz7YgCoeOdp4bNO9E6dN'\n",
    "    response = requests.get(host)\n",
    "    if response:\n",
    "#     print(response.json())\n",
    "        access_token = response.json()[\"access_token\"]\n",
    "    #请求图片验证信息\n",
    "    request_url = \"https://aip.baidubce.com/rest/2.0/ocr/v1/accurate_basic\"\n",
    "    params = {\n",
    "        \"url\":image_url\n",
    "    }\n",
    "    request_url = request_url + \"?access_token=\" + access_token\n",
    "    headers = {'content-type': 'application/x-www-form-urlencoded'}\n",
    "    response = requests.post(request_url, data=params, headers=headers)\n",
    "    response"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.3"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
